# Load the preprocessed Olist order data (creates `order`, among others).
load("./data/olist_1.RData")

# Derive purchase-time features from the order timestamp.
order$year  <- format(order$order_purchase_timestamp, "%Y")  # purchase year
order$month <- format(order$order_purchase_timestamp, "%m")  # purchase month
order$week  <- format(order$order_purchase_timestamp, "%w")  # day of week (0 = Sunday)

# Reorder columns so the new time features sit next to the key fields.
# NOTE(review): positional indices are fragile — confirm they still match the
# column layout saved in olist_1.RData.
order <- order[, c(2, 3, 4, 21, 25, 5, 11:20, 22:24, 26:39)]
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Flag each marketing-qualified lead that eventually closed a deal (1 = converted, 0 = not).
olist_marketing_qualified_leads_dataset$on <- ifelse(
  olist_marketing_qualified_leads_dataset$mql_id %in% olist_closed_deals_dataset$mql_id,
  1, 0
)

# Drop leads whose acquisition channel is unrecorded.
olist_marketing_qualified_leads_dataset <- subset(
  olist_marketing_qualified_leads_dataset,
  origin != "unknown"
)

# Top 5 acquisition channels by lead volume.
table(olist_marketing_qualified_leads_dataset$origin) %>%
  sort(decreasing = TRUE) %>%
  head(5) %>%
  barplot(main = "潛在賣家獲取途徑", col = "blue")

# Top 5 channels ranked by conversion rate (mean of the 0/1 flag per channel).
tapply(
  olist_marketing_qualified_leads_dataset$on,
  olist_marketing_qualified_leads_dataset$origin,
  mean
) %>%
  sort(decreasing = TRUE) %>%
  head(5) %>%
  barplot(main = "潛在賣家獲取管道轉換率排名", col = "orange")
# Part 2: factors affecting sellers and how to improve ----
library("d3heatmap")

# Keep only complete order rows for the score-by-month heatmap.
order <- na.omit(order)

# Column-wise proportions of each review score by purchase month: each
# column (score) sums to 1, so cells compare months within a given score.
# Fix: spell out TRUE/FALSE (T/F are reassignable) and name the dendrogram
# arguments instead of passing bare positional flags.
table(order$month, order$review_score) %>%
  prop.table(2) %>%
  as.data.frame.matrix() %>%
  d3heatmap(
    Rowv = FALSE, Colv = FALSE,  # no row/column dendrograms
    col = colorRamp(c("seagreen", "lightyellow", "red"))
  )
### Rank months by shipping duration
# Orders in the slowest 25% of delivery times (ctcust > 11).
x <- subset(order, ctcust > 11)
# Exploratory tallies, kept commented for reference:
# table(x$customer_city) %>% sort(decreasing = TRUE)
# table(x$customer_state) %>% sort(decreasing = TRUE)

# Which months contribute the most slow deliveries.
table(x$month) %>%
  sort(decreasing = TRUE) %>%
  barplot(main = "運送時長前25%月份", col = "red")

# Top five cities among the slow deliveries.
table(x$customer_city) %>%
  sort(decreasing = TRUE) %>%
  head(5) %>%
  barplot(main = "運送時長前5名城市", col = "green")
# table(order$customer_state) %>% sort(decreasing = TRUE)
# table(x$customer_state) %>% sort(decreasing = TRUE)
library(readr)

# Aggregate the order-items table to one row per seller.
items <- read_csv("data/olist_order_items_dataset.csv")
## Parsed with column specification:
## cols(
## order_id = col_character(),
## order_item_id = col_double(),
## product_id = col_character(),
## seller_id = col_character(),
## shipping_limit_date = col_datetime(format = ""),
## price = col_double(),
## freight_value = col_double()
## )

seller_information <- items %>%
  group_by(seller_id) %>%
  summarise(
    items = n(),                        # item lines sold
    amount = sum(price),                # gross revenue
    products = n_distinct(product_id),  # distinct products offered
    avg_price = mean(price),
    max_price = max(price),
    min_price = min(price),
    avg_freight = mean(freight_value),
    avg_amount_by_product = amount / products,
    # NOTE(review): this computes products / items (products per item line),
    # although the name reads as items per product — confirm the intent.
    avg_item_by_product = products / items
  ) %>%
  arrange(desc(amount))  # biggest sellers first
reviews <- read_csv("./data/olist_order_reviews_dataset.csv")
## Parsed with column specification:
## cols(
## review_id = col_character(),
## order_id = col_character(),
## review_score = col_double(),
## review_comment_title = col_character(),
## review_comment_message = col_character(),
## review_creation_date = col_datetime(format = ""),
## review_answer_timestamp = col_datetime(format = "")
## )

# Per-seller review statistics: pair each (order, seller) combination once
# with its review(s), then aggregate the scores by seller.
review_information <- items[, c("order_id", "seller_id")] %>%
  unique() %>%
  left_join(reviews, by = "order_id") %>%
  group_by(seller_id) %>%
  summarise(
    count_reviews = n(),
    avg_score = mean(review_score),
    min_score = min(review_score),
    max_score = max(review_score)
  )

summary(review_information)
## seller_id count_reviews avg_score min_score
## Length:3095 Min. : 1.0 Min. :1.000 Min. :1.000
## Class :character 1st Qu.: 2.0 1st Qu.:3.750 1st Qu.:1.000
## Mode :character Median : 7.0 Median :4.172 Median :1.000
## Mean : 32.5 Mean :3.983 Mean :2.246
## 3rd Qu.: 22.0 3rd Qu.:4.600 3rd Qu.:4.000
## Max. :1860.0 Max. :5.000 Max. :5.000
## max_score
## Min. :1.000
## 1st Qu.:5.000
## Median :5.000
## Mean :4.685
## 3rd Qu.:5.000
## Max. :5.000
sellers <- read_csv("./data/olist_sellers_dataset.csv")
## Parsed with column specification:
## cols(
## seller_id = col_character(),
## seller_zip_code_prefix = col_character(),
## seller_city = col_character(),
## seller_state = col_character()
## )

# Combine sales, review, and location attributes into one seller table.
seller_summary <- seller_information %>%
  left_join(review_information, by = "seller_id") %>%
  left_join(sellers, by = "seller_id")

# Audit missing values per column (all zero in the knitted output below).
vapply(seller_summary, function(col) sum(is.na(col)), numeric(1))
## seller_id items amount
## 0 0 0
## products avg_price max_price
## 0 0 0
## min_price avg_freight avg_amount_by_product
## 0 0 0
## avg_item_by_product count_reviews avg_score
## 0 0 0
## min_score max_score seller_zip_code_prefix
## 0 0 0
## seller_city seller_state
## 0 0

head(seller_summary)
orders <- read_csv("data/olist_orders_dataset.csv")
## Parsed with column specification:
## cols(
## order_id = col_character(),
## customer_id = col_character(),
## order_status = col_character(),
## order_purchase_timestamp = col_datetime(format = ""),
## order_approved_at = col_datetime(format = ""),
## order_delivered_carrier_date = col_datetime(format = ""),
## order_delivered_customer_date = col_datetime(format = ""),
## order_estimated_delivery_date = col_datetime(format = "")
## )

# Per-seller count of item lines delivered after the estimated date.
# Orders with no delivery timestamp yield NA and are dropped up front.
orders_delay <- orders %>%
  mutate(delay = order_delivered_customer_date > order_estimated_delivery_date) %>%
  filter(!is.na(delay)) %>%
  select(order_id, delay) %>%
  left_join(items, by = "order_id") %>%
  group_by(seller_id) %>%
  summarise(total_delay = sum(delay))

# Inner join: keep only sellers with at least one delivery-dated order.
seller_summary <- seller_summary %>%
  inner_join(orders_delay, by = "seller_id")
# Attach the business segment from the closed-deals funnel data; the inner
# join keeps only sellers that were acquired through that funnel.
closed_deals <- read.csv("data/olist_closed_deals_dataset.csv")
seller_segment <- closed_deals %>%
  select(seller_id, business_segment)
seller_summary <- seller_summary %>%
  inner_join(seller_segment, by = "seller_id")
## Warning: Column `seller_id` joining character vector and factor, coercing
## into character vector
seller_summary
library(ggplot2)

# Features used both for profiling (raw, in A) and clustering (scaled).
clus_vars <- c("items", "amount", "avg_price", "avg_freight",
               "avg_score", "count_reviews", "total_delay")
A <- seller_summary %>%
  select(all_of(clus_vars))
seller_clus <- A %>%
  scale() %>%
  data.frame()

# Ward hierarchical clustering on Euclidean distances of the scaled features.
hc <- hclust(dist(seller_clus, method = "euclidean"), method = "ward.D")
plot(hc)
rect.hclust(hc, k = 3, border = "red")

### Cut into three clusters and profile each group with a bar chart
kg <- cutree(hc, k = 3)
table(kg)
## kg
## 1 2 3
## 47 208 121

# Raw-scale feature means per cluster.
split(A, kg) %>%
  sapply(colMeans) %>%
  round(2)
## 1 2 3
## items 24.36 15.92 4.82
## amount 6570.72 1530.53 402.45
## avg_price 644.96 106.52 89.15
## avg_freight 55.40 19.61 17.40
## avg_score 4.38 3.95 4.89
## count_reviews 22.13 14.23 4.30
## total_delay 2.32 1.17 0.08

# Scaled feature means per cluster, as grouped bars.
par(cex = 0.8)
split(seller_clus, kg) %>%
  sapply(colMeans) %>%
  barplot(beside = TRUE, col = rainbow(7))
legend("topright", legend = colnames(A), fill = rainbow(7))
### Attach the cluster label back onto each seller
seller_summary$cluster <- factor(kg)

# Per-cluster business profile.
seller_summary %>%
  group_by(cluster) %>%
  summarise(
    count = n(),
    revenue = sum(amount),
    avg_item_sold = mean(items),
    # NOTE(review): revenue / avg_item_sold divides total revenue by the mean
    # item count, which is not a per-item average price — confirm the metric.
    avg_price = revenue / avg_item_sold,
    avg_amount = mean(amount),
    avg_review = mean(count_reviews),
    avg_score = mean(avg_score),
    avg_delay = mean(total_delay)
  )

# Cluster-by-segment contingency table, in long data-frame form.
B <- table(seller_summary$cluster, seller_summary$business_segment) %>%
  data.frame()

# Items sold vs revenue per seller (log-log), colored by mean review score,
# one panel per cluster.
ggplot(seller_summary, aes(items, amount, color = avg_score)) +
  geom_point(alpha = 0.8) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~cluster)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.

# Google Maps Platform API key.
# SECURITY FIX: the key was previously committed as a literal in this file.
# Read it from the environment instead (e.g. set GOOGLE_MAPS_API_KEY in
# ~/.Renviron) and revoke/rotate the exposed key in the Google Cloud console.
register_google(key = Sys.getenv("GOOGLE_MAPS_API_KEY"))

# Base map of Brazil for the seller location plots.
map <- get_map(location = "Brasil", zoom = 5, language = "zh-TW", maptype = "roadmap")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=Brasil&zoom=5&size=640x640&scale=2&maptype=roadmap&language=zh-TW&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Brasil&key=xxx
# Load the geolocation reference and shorten its column names for joining.
geo <- read.csv("./data/olist_geolocation_dataset.csv")
colnames(geo)[colnames(geo) == "geolocation_zip_code_prefix"] <- "zip_code_prefix"
colnames(geo)[colnames(geo) == "geolocation_city"] <- "city"
colnames(geo)[colnames(geo) == "geolocation_state"] <- "state"

# One zip prefix maps to many coordinates; collapse each prefix to its
# median latitude/longitude so it plots as a single point.
da <- geo %>%
  group_by(zip_code_prefix) %>%
  summarise(lat = median(geolocation_lat), lng = median(geolocation_lng))

# Join cluster labels with coordinates and plot the sellers on the map.
group3_geo <- seller_summary[, c("seller_id", "seller_zip_code_prefix", "cluster")]
group3 <- merge(group3_geo, da,
                by.x = "seller_zip_code_prefix", by.y = "zip_code_prefix",
                all.x = TRUE)
ggmap(map) +
  geom_point(aes(x = lng, y = lat, col = factor(cluster)),
             size = 1, data = group3, alpha = 0.6)
## Warning: Removed 163 rows containing missing values (geom_point).
## Map sellers with high late-delivery counts
# `late` holds precomputed per-seller late-delivery frequencies.
late <- read.csv("./data/latefreq.csv")
late <- merge(late, sellers, by = "seller_id", all.x = TRUE)
late <- merge(late, da,
              by.x = "seller_zip_code_prefix", by.y = "zip_code_prefix",
              all.x = TRUE)

# Bucket the late counts. cut() fixes two defects in the original code:
# (1) a count of exactly 30 matched neither `< 30` nor `> 30 & <= 50`, so it
#     became NA and was silently dropped by na.omit() below;
# (2) `df[cond, ]$col <- ...` errors when no row satisfies the condition.
late$freq_fac <- cut(
  late$late_frq,
  breaks = c(-Inf, 30, 50, 100, Inf),
  labels = c("0~30", "31~50", "51~100", ">100")
)

# Keep only sellers with more than 50 late deliveries and valid coordinates.
late <- filter(late, late_frq > 50)
late <- na.omit(late)

ggmap(map) +
  geom_point(aes(x = lng, y = lat, col = freq_fac),
             size = 2, data = late, alpha = 0.6)